This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.

Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.

They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.

Data import

Expression data

load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426   45

TFBS data

load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426  201

GRN inference

How does the individual MSE of genes varies with \(\alpha\) for model estimation in GRN inferences?

bRf : biased Random Forests

# ALPHAS <- seq(0,1, by = 0.1)
# lmses <- data.frame(row.names = genes)
# 
# for(alpha in ALPHAS){
#   set.seed(121314)
#   lmses[,as.character(alpha)] <- bRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 1000,
#                              pwm_occurrence = pwm_occurrence, nCores = 18)
# }
# 
# save(lmses, file = "results/logMSES_RFs_per_genes.rdata")

Clustering the genes :

load(file = "rdata/logMSES_RFs_per_genes_400trees.rdata")

mse <- (lmses-rowMeans(lmses)) / matrixStats::rowSds(as.matrix(lmses))

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
    annotation_name_side = "left")



heatmap_rf_4000 <- Heatmap(mse, row_km = 3, cluster_columns = F, show_row_names = F, 
        name = "scaled logMSE (z-score)", top_annotation = ha,
        row_title = "Genes", column_title = "alpha");heatmap_rf_4000

LASSO-D3S

# lmses_lasso <- data.frame(row.names = genes)
# 
# for(alpha in ALPHAS){
#   set.seed(121314)
#   lmses_lasso[,as.character(alpha)] <- LASSO.D3S_inference_MSE(counts, genes, tfs,
#                                                                normalized = TRUE, 
#                                                                alpha = alpha, N = 50,
#                                pwm_occurrence = pwm_occurrence, nCores = 15)
# }
# 
# save(lmses_lasso, file = "results/logMSES_Lasso_per_genes.rdata")
load(file = "results/logMSES_Lasso_per_genes.rdata")
mse_lasso <- (lmses_lasso-rowMeans(lmses_lasso)) / matrixStats::rowSds(as.matrix(lmses_lasso))

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
    annotation_name_side = "left")


heatmap_lasso <- Heatmap(mse_lasso, row_km = 3, cluster_columns = F, show_row_names = F, 
        name = "scaled logMSE (z-score)", top_annotation = ha,
        row_title = "Genes", column_title = "alpha"); heatmap_lasso

Clustering global des gènes suivant alpha et les deux méthodes

Il semblerait que les gènes dont la MSE diminue ne soient pas les mêmes suivant bRF et LASSO, seul un petit sous ensemble est commun:

hall = HeatmapAnnotation(
  method = c(rep("bRF", 11), rep("lasso", 11)),
    alpha = anno_simple(as.numeric(rep(colnames(mse),2))),
    annotation_name_side = "left")

Heatmap(cbind(mse, mse_lasso), row_km = 6, cluster_columns = F, show_row_names = F, 
        name = "scaled logMSE", top_annotation = hall, clustering_distance_rows = "spearman",
        row_title = "Genes")

Y-a-til des gènes qu’on n’arrive pas à bien prédire, dans aucun cas?

On aimerait exclure ou en tout cas identifier dans ces graphiques s’il y a des gènes que l’on prédit mal quel que soit alpha.

En fait les gènes sur lesquels on se trompe le plus sont aussi les gènes les plus exprimés… Donc on va normaliser la MSE par la variance du gène cible pour avoir des MSE comparables entre gènes.

mean_expr <- rowMeans(counts)
var_expr <- matrixStats::rowSds(counts)*matrixStats::rowSds(counts)

all_mses <- cbind(lmses, lmses_lasso)
#all_mses_scaled <- (all_mses-rowMeans(all_mses)) / matrixStats::rowSds(as.matrix(all_mses))
all_mses_scaled <- exp(all_mses) / var_expr

min_mses <- matrixStats::rowMins(as.matrix(all_mses_scaled))

min_mses_lasso <- matrixStats::rowMins(as.matrix(exp(lmses_lasso)/var_expr))
min_mses_rf <- matrixStats::rowMins(as.matrix(exp(lmses)/var_expr))

hist(min_mses_lasso, breaks = 100)

hist(min_mses_rf, breaks = 100)

hist(min_mses, breaks = 100)

# df <- data.frame(mean_expr)
bad_genes_lasso <- genes[which(min_mses_lasso > 0.3)]
bad_genes_rf <- genes[which(min_mses_rf > 0.3)]


bad_genes_lasso <- min_mses_lasso > 0.3
bad_genes_rf <- min_mses_rf > 0.3

# df$bad <- rownames(df) %in% bad_genes
# df$min_lmse <- min_mses
# ggplot(df, aes(x=bad, y=mean_expr)) + geom_boxplot()
# ggplot(df, aes(x=log(mean_expr), y=min_lmse)) + geom_point()
# 
mse_lasso_scaled <- exp(lmses_lasso)/var_expr
mse_rf_scaled <- exp(lmses)/var_expr


# mse_lasso_scaled[bad_genes_lasso>0.3,]<-NA
# mse_rf_scaled[bad_genes_rf>0.3,]<-NA
# 

# 
# ha = HeatmapAnnotation(
#     alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
#     annotation_name_side = "left")
# 
# Heatmap(mse_lasso_scaled, row_km = 0, cluster_columns = F, show_row_names = F, 
#         name = "scaled logMSE (z-score)", top_annotation = ha, 
#         row_title = "Genes", column_title = "alpha") + rowAnnotation(
#     scaled_min_mse = min_mses_lasso)
# 
# Heatmap(mse_rf_scaled, row_km = 0, cluster_columns = F, show_row_names = F, 
#         name = "scaled logMSE (z-score)", top_annotation = ha,
#         row_title = "Genes", column_title = "alpha")+ rowAnnotation(
#     scaled_min_mse = min_mses_rf)

Sans normaliser par la MSE moyenne par gène, on ne voit plus les variations de MSE liées à alpha, on voit uniquement les gènes qui sont globalement bien ou mal prédits. Deux méthodes simultanément :

Heatmap(log(all_mses_scaled), row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, col = rev(hcl.colors(5, palette = "Reds 2")),
        row_title = "Genes")+ rowAnnotation(
    scaled_min_mse = min_mses)

Je remets donc un z-score sur la MSE normalisée par gène englobant les deux méthodes, pour visualiser plus précisément les variations de MSE suivant alpha:

Heatmap((all_mses_scaled-rowMeans(all_mses_scaled))/matrixStats::rowSds(as.matrix(all_mses_scaled)), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall,
        row_title = "Genes")+ rowAnnotation(
    scaled_min_mse = min_mses)

En fait, comme le z-score est calculé sur les deux méthodes, on a encore une grosse contribution à la couleur de la différence de MSE inhérente aux deux méthodes, et non à l’effet de l’intégration. Je calcule donc un z-score par méthodes pour faire la représentation:

mse_lasso_scaled_zscore <- (mse_lasso_scaled-rowMeans(mse_lasso_scaled, na.rm = T))/matrixStats::rowSds(as.matrix(mse_lasso_scaled, na.rm=T))
mse_rf_scaled_zscore <- (mse_rf_scaled-rowMeans(mse_rf_scaled))/matrixStats::rowSds(as.matrix(mse_rf_scaled))

col_fun = circlize::colorRamp2(c(-0.8, 0, 0.3), c("blue", "white", "red"))


Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    scaled_min_mse = min_mses_lasso-min_mses_rf, col = list(scaled_min_mse = col_fun))

Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    bad_for_lasso = bad_genes_lasso,
    bad_for_rf = bad_genes_rf)

c’est informatif : certains gènes bénéficient de l’intégration de données et d’autres non, et ce n’est pas spécialement lié à si on les prédit bien globalement. Par contre celà dépend visiblement de la méthode utilisée et nous n’avons pas encore d’explication pour ça…

Faire des clusters informatifs et clairs pour notre question

Définir des groupes suivant l’optimum comme par exemple:

get_optimum <- function(gene, method = "rf"){
  if(method=="rf")
    return(as.numeric(names(which.min(mse_rf_scaled_zscore[gene,]))))
  if(method=="lasso")
    return(as.numeric(names(which.min(mse_lasso_scaled_zscore[gene,]))))
}

assign_cluster <- function(opt){
  if(opt == 0)
    return("0")
  if(opt>0 & opt <=0.4){
    return("0.1-0.4")
  }
  if(opt>0.4 & opt <=0.7){
    return("0.5-0.7")
  }
  if(opt>0.7 & opt <=1){
    return("0.8-1")
  }
}
clusters_rf <- sapply(sapply(genes, get_optimum, method = "rf"), assign_cluster)
clusters_lasso <- sapply(sapply(genes, get_optimum, method = "lasso"), assign_cluster)
table(clusters_lasso)
## clusters_lasso
##       0 0.1-0.4 0.5-0.7   0.8-1 
##     635     474     197     120
table(clusters_rf)
## clusters_rf
##       0 0.1-0.4 0.5-0.7   0.8-1 
##     440     496     218     272
Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    clusters_rf = clusters_rf,
    clusters_lasso = clusters_lasso, 
    col=list(clusters_lasso = setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))),
             clusters_rf= setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))) ))

draw_specific_genes <- function(genes){
  Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore)[genes,], 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    clusters_rf = clusters_rf[genes],
    clusters_lasso = clusters_lasso[genes], 
    col=list(clusters_lasso = setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))),
             clusters_rf= setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))) ))
}
all_pwm_pos <- names(which(clusters_lasso == "0.8-1" & clusters_rf == "0.8-1"))
lasso_bad_rf_good <- names(which(clusters_lasso == "0" & clusters_rf == "0.8-1"))
lasso_good_rf_bad <- names(which(clusters_lasso == "0.8-1" & clusters_rf == "0"))

all_yellow <-  names(which(clusters_lasso == "0.5-0.7" & clusters_rf == "0.5-0.7"))
all_orange <-  names(which(clusters_lasso == "0.1-0.4" & clusters_rf == "0.1-0.4"))
all_red <-  names(which(clusters_lasso == "0" & clusters_rf == "0"))

different <- names(which(clusters_lasso != clusters_rf))
draw_specific_genes(all_pwm_pos)

draw_specific_genes(c(lasso_bad_rf_good, lasso_good_rf_bad))

draw_specific_genes(all_yellow)

draw_specific_genes(all_orange)

draw_specific_genes(all_red)

draw_specific_genes(different)

Quelles sont les caractéristiques des clusters de gènes

Nombre de motifs dans le promoteur, niveau d’expression, ontologies, sont-ils des TFs? leur taille?

Quelle est la précision rappel des sous réseaux concernant ces gènes.

load("rdata/pwm_prom_jaspar_dap.rdata")

genes_info <- data.frame(genes = genes, 
                         cluster_lasso = clusters_lasso[genes], 
                         cluster_rf = clusters_rf[genes])
genes_info$is_tf <- genes %in% tfs
genes_info$var <- var_expr
genes_info$expr <- mean_expr
genes_info$nb_motifs <- table(pwm_prom$target)[genes]

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
ggplot(aes(x=cluster_rf, y=nb_motifs, 
                       fill = cluster_rf==cluster_lasso)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
  ggtitle(("Number of motifs in promoter"))
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
  ggplot(aes(x=cluster_rf, y=var, 
                       fill = cluster_rf==cluster_lasso)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
  ggtitle(("Gene variance for all groups")) + scale_y_log10()

genes_info%>%
  ggplot(aes(x=cluster_rf, y=var)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for RF groups")) + scale_y_log10()+
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=var)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for LASSO groups")) + scale_y_log10()+
  stat_compare_means()

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
  ggplot(aes(x=cluster_rf, y=expr, 
             fill = cluster_rf==cluster_lasso)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
  ggtitle(("Gene expression for all groups")) + scale_y_log10()

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
  mutate(agree_on_beneficial_pwms = cluster_lasso == "0.8-1" & cluster_lasso==cluster_rf) %>%
  ggplot(aes(x=agree_on_beneficial_pwms, y=expr, 
             fill = agree_on_beneficial_pwms)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  xlab("cluster RF") +
  ggtitle(("Gene expression : PWM beneficial for both methods vs other groups")) + scale_y_log10()

genes_info%>%
  ggplot(aes(x=cluster_rf, y=expr)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for RF groups")) + scale_y_log10()+
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=expr)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for LASSO groups")) + scale_y_log10()+
  stat_compare_means()

clusters_info <- genes_info %>%
  group_by(cluster_lasso, cluster_rf) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n())
## `summarise()` has grouped output by 'cluster_lasso'. You can override using the
## `.groups` argument.
ggplot(clusters_info, aes(x=cluster_rf, y=tf_frac, 
                          label = paste("n=",n),
                          fill = cluster_rf==cluster_lasso)) + 
  geom_bar(stat = "identity")+
  facet_wrap(~cluster_lasso )+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.22) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in all groups")

genes_info %>%
  group_by(cluster_lasso) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_lasso, y=tf_frac, 
                          label = paste("n=",n))) + 
  geom_bar(stat = "identity")+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.16) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in LASSO groups")

genes_info %>%
  group_by(cluster_rf) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_rf, y=tf_frac, 
                          label = paste("n=",n))) + 
  geom_bar(stat = "identity")+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.15) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in RF groups")

Est-ce qu’on a significativement plus de TFs dans les genes qui sont bien prédits avec les PWM chez le lasso?

fisher.test(matrix(c(length(genes), length(tfs), 120, 120*0.175), 
                   nrow = 2), alternative = "greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(length(genes), length(tfs), 120, 120 * 0.175), nrow = 2)
## p-value = 0.2257
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.7895277       Inf
## sample estimates:
## odds ratio 
##   1.241379

Precision and recall of inferred GRNs subsets for different groups

load("rdata/networks_bRF_lasso.rdata")
prettyZero <- function(l){
    max.decimals = max(nchar(str_extract(l, "\\.[0-9]+")), na.rm = T)-1
    lnew = formatC(l, replace.zero = T, zero.print = "0",
        digits = max.decimals, format = "f", preserve.width=T)
    return(lnew)}
    
all_pwm_pos
##  [1] "AT2G19600" "AT5G65830" "AT2G42280" "AT1G31320" "AT1G22500" "AT2G01670"
##  [7] "AT5G50400" "AT1G60960" "AT3G49570" "AT2G28305" "AT4G35320" "AT2G47700"
## [13] "AT2G33020" "AT4G10150" "AT2G24120" "AT3G61830" "AT2G28850" "AT3G02832"
## [19] "AT3G06110" "AT5G16370" "AT2G46790" "AT3G14720" "AT3G63280" "AT2G34170"
## [25] "AT1G69310" "AT3G62690" "AT1G63880" "AT1G80440" "AT2G25090" "AT5G46050"
## [31] "AT1G30080" "AT5G67480" "AT1G76020" "AT4G37400" "AT2G05590" "AT3G50660"
## [37] "AT1G53530"
draw_precision_recall <- function(genes){
  subset <- lapply(edges, function(net){net[net$to %in% genes,]})


color_palette = c("#88002D", "#C12131", "#EC5D2F", "#FE945C", "#FFC08E" )

    
settings <- c("method", "alpha", "density", "rep")

# number of edges per network
nrows <-
  data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
             n_edges = unlist(lapply(subset, FUN = nrow)))
nrows[, settings] <-
  str_split_fixed(nrows$alpha_rep, '_', length(settings))


subset$`LASSO-D3S_0_0.005_1`

val_conn <-
  evaluate_networks(
    subset,
    validation = c("TARGET", "CHIPSeq", "DAPSeq"),
    nCores = 10,
    input_genes = genes,
    input_tfs = tfs
  )
val_conn[, settings] <-
  str_split_fixed(val_conn$network_name, '_', length(settings))

val_connecTF <-  val_conn %>%
  dplyr::select(-network_name) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  ggplot(aes(color = density, x = alpha, y = precision)) +
  ggh4x::facet_nested_wrap(vars(method), ncol = 8, nest_line = T) + geom_point() +
  geom_smooth(aes(fill = density), alpha = 0.1) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'none'
  ) +
  xlab(expression(alpha)) + ylab("Precision") +
  ggtitle("Precision against ConnecTF") +
  scale_x_continuous(labels = prettyZero) +
  scale_color_manual(name = "Network density", values = color_palette) +
  scale_fill_manual(name = "Network density", values = color_palette)
val_connecTF
}

for(group in unique(clusters_lasso)){
  print(draw_precision_recall(names(which(clusters_lasso == group)))+
          ggtitle(paste("LASSO, group", group)))
  print(draw_precision_recall(names(which(clusters_rf == group)))+
          ggtitle(paste("RF, group", group)))
}
## 
## Parallel network validation with AranetBench Using 10 cores.
## 6.026 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 7.554 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 12.184 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 9.503 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 10.342 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 10.897 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 7.642 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## 
## Parallel network validation with AranetBench Using 10 cores.

## 8.258 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

library(DIANE)
## 
## Attaching package: 'DIANE'
## The following object is masked from 'package:igraph':
## 
##     normalize
background <- rownames(gene_annotations$`Arabidopsis thaliana`)
background <- convert_from_agi(background)
## 
for(group in unique(clusters_lasso)){
  
  genes_i <- names(which(clusters_lasso == group))
  
  print(paste("LASSO", length(genes_i), "genes,", group))
  genes_i <- convert_from_agi(genes_i)
  go_lasso <- enrich_go(genes_i, background)
  print(DIANE::draw_enrich_go(go_lasso))
  
  genes_i <- names(which(clusters_rf == group))
  print(paste("RF", length(genes_i), "genes, min mse at", group))
  genes_i <- convert_from_agi(genes_i)
  go_rf <- enrich_go(genes_i, background)
  print(DIANE::draw_enrich_go(go_rf))
  
  common_go <- intersect(go_lasso$ID, go_rf$ID)
  print(go_lasso[common_go,])
}
## [1] "LASSO 120 genes, 0.8-1"
## 
## [1] "RF 272 genes, min mse at 0.8-1"
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 rows> (or 0-length row.names)
## [1] "LASSO 635 genes, 0"
## [1] "RF 440 genes, min mse at 0"
##                    ID                                 Description GeneRatio
## GO:0022613 GO:0022613        ribonucleoprotein complex biogenesis    33/532
## GO:0042254 GO:0042254                         ribosome biogenesis    31/532
## GO:0034470 GO:0034470                            ncRNA processing    31/532
## GO:0006364 GO:0006364                             rRNA processing    26/532
## GO:0016072 GO:0016072                      rRNA metabolic process    26/532
## GO:0042273 GO:0042273          ribosomal large subunit biogenesis    11/532
## GO:0015711 GO:0015711                     organic anion transport    11/532
## GO:0015849 GO:0015849                      organic acid transport    11/532
## GO:0019318 GO:0019318                    hexose metabolic process     8/532
## GO:2001057 GO:2001057 reactive nitrogen species metabolic process     6/532
## GO:0042126 GO:0042126                   nitrate metabolic process     5/532
## GO:0042128 GO:0042128                        nitrate assimilation     5/532
## GO:0071941 GO:0071941            nitrogen cycle metabolic process     5/532
## GO:0015695 GO:0015695                    organic cation transport     3/532
##              BgRatio       pvalue     p.adjust       qvalue
## GO:0022613 427/21664 7.407051e-09 2.498028e-06 2.249404e-06
## GO:0042254 349/21664 7.779240e-10 7.432053e-07 6.692356e-07
## GO:0034470 388/21664 9.826013e-09 2.651058e-06 2.387204e-06
## GO:0006364 256/21664 1.101861e-09 7.432053e-07 6.692356e-07
## GO:0016072 268/21664 2.930627e-09 1.317805e-06 1.186647e-06
## GO:0042273  93/21664 1.749132e-05 1.123609e-03 1.011779e-03
## GO:0015711 108/21664 7.113627e-05 3.554179e-03 3.200439e-03
## GO:0015849 115/21664 1.257931e-04 5.107640e-03 4.599287e-03
## GO:0019318  90/21664 1.668753e-03 4.414015e-02 3.974697e-02
## GO:2001057  18/21664 3.078488e-06 3.405256e-04 3.066338e-04
## GO:0042126  13/21664 9.581337e-06 7.180680e-04 6.466002e-04
## GO:0042128  13/21664 9.581337e-06 7.180680e-04 6.466002e-04
## GO:0071941  17/21664 4.246239e-05 2.386740e-03 2.149193e-03
## GO:0015695  11/21664 2.097292e-03 4.963592e-02 4.469576e-02
##                                                                                                                                                                    geneID
## GO:0022613 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/NA/AtGHS40/NA/NA/CP33/AtNug2/ATFAS4/ISE4/NA/WDR55/NA/RBL/NA/NA/NA/NA/AtNMD3/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0042254         MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/NA/AtGHS40/NA/NA/CP33/AtNug2/ATFAS4/NA/WDR55/NA/RBL/NA/NA/NA/AtNMD3/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0034470           MEE49/NA/TRZ3/atENP1/RPL7A/NA/NA/EMB2762/AtGHS40/NA/NA/CP33/ATFAS4/NA/WDR55/NA/NA/NA/NA/REB1/NA/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/NA/atBRX1-1/NA/AtRPP30
## GO:0006364                                 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/AtGHS40/NA/NA/CP33/ATFAS4/NA/WDR55/NA/NA/NA/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0016072                                 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/AtGHS40/NA/NA/CP33/ATFAS4/NA/WDR55/NA/NA/NA/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0042273                                                                                                             MEE49/NA/RPL7A/NA/NA/RBL/NA/NA/AtPEIP1/NA/atBRX1-1
## GO:0015711                                                                                        ALMT9/AtmSFC1/APC2/DiT1/ATSDAT/AIT1/ADNT1/ALMT4/AtDTX50/UMAMIT14/AtNiaP
## GO:0015849                                                                                          ALMT9/AtmSFC1/DiT1/ATSDAT/AIT1/NA/ALMT4/AtDTX50/UMAMIT14/SIAR1/AtNiaP
## GO:0019318                                                                                                                  GAPCP-2/G6PD2/PGI/FRK1/AtFBA8/FRK3/REB1/G6PD5
## GO:2001057                                                                                                                        AtUPM1/GLT1/ATGSKB6/NLP7/CML23/ATGLN1;1
## GO:0042126                                                                                                                              AtUPM1/GLT1/ATGSKB6/NLP7/ATGLN1;1
## GO:0042128                                                                                                                              AtUPM1/GLT1/ATGSKB6/NLP7/ATGLN1;1
## GO:0071941                                                                                                                              AtUPM1/GLT1/ATGSKB6/NLP7/ATGLN1;1
## GO:0015695                                                                                                                                              APC2/ADNT1/AtNiaP
##            Count
## GO:0022613    33
## GO:0042254    31
## GO:0034470    31
## GO:0006364    26
## GO:0016072    26
## GO:0042273    11
## GO:0015711    11
## GO:0015849    11
## GO:0019318     8
## GO:2001057     6
## GO:0042126     5
## GO:0042128     5
## GO:0071941     5
## GO:0015695     3
## [1] "LASSO 474 genes, 0.1-0.4"
## [1] "RF 496 genes, min mse at 0.1-0.4"
##                    ID                          Description GeneRatio   BgRatio
## GO:0022613 GO:0022613 ribonucleoprotein complex biogenesis    33/391 427/21664
## GO:0042254 GO:0042254                  ribosome biogenesis    30/391 349/21664
## GO:0034470 GO:0034470                     ncRNA processing    29/391 388/21664
## GO:0006364 GO:0006364                      rRNA processing    25/391 256/21664
##                  pvalue     p.adjust       qvalue
## GO:0022613 2.494920e-12 1.550593e-09 1.445741e-09
## GO:0042254 1.758817e-12 1.550593e-09 1.445741e-09
## GO:0034470 1.238270e-10 3.078339e-08 2.870179e-08
## GO:0006364 8.272055e-12 3.427388e-09 3.195626e-09
##                                                                                                                                                                   geneID
## GO:0022613 NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/AtNLE/DIM1A/NA/UGE3/ATPWP2/AtTRM7b/NA/NA/DG238/RH17/PCP1/SmD1a/AtERG2/AtRH36/NA/NA/RH29/NA/NA/IMP4/NA/EIF2/NOF1
## GO:0042254               NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/AtNLE/DIM1A/NA/UGE3/ATPWP2/AtTRM7b/NA/NA/DG238/RH17/PCP1/AtERG2/AtRH36/NA/RH29/NA/NA/IMP4/NA/NOF1
## GO:0034470           ATIPT3/NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/DIM1A/UGE3/ATPWP2/AtTRM1b/AtTRM7b/NA/NA/PCP1/AtTRM11/AtTRM8a/AtRH36/NA/RH29/NA/NA/IMP4/NA/NOF1
## GO:0006364                                          NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/DIM1A/UGE3/ATPWP2/AtTRM7b/NA/NA/PCP1/AtRH36/NA/RH29/NA/NA/IMP4/NA/NOF1
##            Count
## GO:0022613    33
## GO:0042254    30
## GO:0034470    29
## GO:0006364    25
## [1] "LASSO 197 genes, 0.5-0.7"
## [1] "RF 218 genes, min mse at 0.5-0.7"
##                    ID            Description GeneRatio   BgRatio       pvalue
## GO:0042254 GO:0042254    ribosome biogenesis    11/170 349/21664 1.024780e-04
## GO:0006364 GO:0006364        rRNA processing    10/170 256/21664 3.612792e-05
## GO:0016072 GO:0016072 rRNA metabolic process    10/170 268/21664 5.313390e-05
##               p.adjust      qvalue
## GO:0042254 0.010644906 0.009991608
## GO:0006364 0.008878772 0.008333866
## GO:0016072 0.008878772 0.008333866
##                                                   geneID Count
## GO:0042254 NA/NA/PCP2/NA/EBP2/ATRRP4/RNC4/NA/NA/ARRS1/NA    11
## GO:0006364    NA/PCP2/NA/EBP2/ATRRP4/RNC4/NA/NA/ARRS1/NA    10
## GO:0016072    NA/PCP2/NA/EBP2/ATRRP4/RNC4/NA/NA/ARRS1/NA    10